Aside: Baseline of Zero?

library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5     ✓ purrr   0.3.4
## ✓ tibble  3.1.6     ✓ dplyr   1.0.7
## ✓ tidyr   1.1.4     ✓ stringr 1.4.0
## ✓ readr   2.1.1     ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(palmerpenguins)
## Scatterplot of bill depth against bill length with both axes forced to
## start at zero.
penguins %>%
  ggplot(mapping = aes(x = bill_length_mm, y = bill_depth_mm)) +
  geom_point() +
  lims(x = c(0, 60), y = c(0, 25))
## Warning: Removed 2 rows containing missing values (geom_point).

## length vs relative position: people read bar plots as comparisons of length, so bars need a baseline of zero, while a line plot or scatterplot encodes relative position and can use a different scale because people can still conceptualize the relative differences

Section 5.1 Exercises: Billboard

The data set only contains a few variables:

library(billboard)
head(wiki_hot_100s)
##   no                     title              artist year
## 1  1 Theme from A Summer Place         Percy Faith 1960
## 2  2          He'll Have to Go          Jim Reeves 1960
## 3  3             Cathy's Clown The Everly Brothers 1960
## 4  4              Running Bear      Johnny Preston 1960
## 5  5                Teen Angel        Mark Dinning 1960
## 6  6                 I'm Sorry          Brenda Lee 1960
tail(wiki_hot_100s)
##       no                   title                             artist year
## 5696  95 Adventure of a Lifetime                           Coldplay 2016
## 5697  96         Humble and Kind                         Tim McGraw 2016
## 5698  97                  Wicked                             Future 2016
## 5699  98           Tiimmy Turner                          Desiigner 2016
## 5700  99           See You Again Wiz Khalifa featuring Charlie Puth 2016
## 5701 100                 Perfect                      One Direction 2016
max(wiki_hot_100s$year)
## [1] "2016"
library(rvest)
## 
## Attaching package: 'rvest'
## The following object is masked from 'package:readr':
## 
##     guess_encoding
library(tidyverse)

## Ten artists with the most year-end Hot 100 entries from 2000 through 2009.
## `year` is stored as character in wiki_hot_100s (max() above prints "2016"),
## so convert it explicitly rather than relying on an implicit string
## comparison against the numeric bounds.
billboard_count <- wiki_hot_100s %>%
  filter(between(as.numeric(year), 2000, 2009)) %>%
  group_by(artist) %>%
  summarise(ncount = n()) %>%
  arrange(desc(ncount)) %>%
  slice(1:10) %>%
  ## order the factor by count so the plot is sorted by number of hits
  mutate(artist_ordered = fct_reorder(.f = artist, .x = ncount))

## Horizontal bar plot of hit counts, one bar per artist, with the bars
## filled by artist on the discrete viridis scale.
billboard_count %>%
  ggplot(mapping = aes(x = ncount, y = artist_ordered)) +
  geom_col(mapping = aes(fill = artist_ordered)) +
  scale_fill_viridis_d()

Exercises

Exercise 1. Make the visualization that we sketched in class. We will complete this exercise as a class. (done)

Exercise 2. There is a minor flaw in the way that we counted up the number of hits for each artist. Examine the 2nd to last row of the original data set with tail() to look at this potential flaw. What do you find?

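Any song on which an artist is only featured does not count toward that artist's total: the whole string (for example, "Wiz Khalifa featuring Charlie Puth") is treated as its own separate artist.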

Exercise 3. Challenging. Fix the issue in Exercise 2. May want to skip this question and come back after completing the other exercises.

## Give featured artists their own row so that a featured appearance counts
## toward that artist's total.
## fill = "right" states explicitly that songs without " featuring " get NA
## for artist2, instead of triggering a warning for every such row.
sep_artist <- wiki_hot_100s %>%
  separate(
    col = artist, into = c("artist1", "artist2"),
    sep = " featuring ", fill = "right"
  ) %>%
  ## one row per (song, artist); drop the NA artist2 placeholders directly
  pivot_longer(
    cols = c(artist1, artist2), names_to = "name", values_to = "artist",
    values_drop_na = TRUE
  ) %>%
  select(-name)
## Warning: Expected 2 pieces. Missing pieces filled with `NA` in 5117 rows [1, 2,
## 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, ...].

Exercise 4. Change the plot from Exercise 1 to be a Lollipop chart using this website as a reference. Why might the lollipop chart be better than a bar plot?

## Lollipop chart: a horizontal stem from 0 to each artist's hit count,
## capped with a point at the count. The stem has to vary in x and stay on
## the artist's own row (y = yend = artist_ordered).
ggplot(data = billboard_count, aes(x = ncount, y = artist_ordered)) +
  geom_segment(
    aes(x = 0, xend = ncount, y = artist_ordered, yend = artist_ordered)
  ) +
  geom_point()

With a lollipop chart, it is easier to see which artists had the same number of top 100 hits over the 10-year period. It also has a higher data-to-ink ratio than a bar plot.

Exercise 5. Use this website to customize the end points of your lollipop chart. If you have time, you can explore the other customization options. Make it look fancy!

## Customised lollipop chart: grey horizontal stems from 0 to each artist's
## hit count, drawn first so the orange end points sit on top of them.
ggplot(data = billboard_count, aes(x = ncount, y = artist_ordered)) +
  geom_segment(
    aes(x = 0, xend = ncount, y = artist_ordered, yend = artist_ordered),
    colour = "grey"
  ) +
  geom_point(colour = "orange", size = 4)

## provide the URL and name it something (in this case, url).
## paste0 pastes together the base URL and the year into a single string:
## this will be useful in a moment
year <- 2017

url <- paste0("https://en.wikipedia.org/wiki/Billboard_Year-End_Hot_100_singles_of_", year)
## NOTE(review): ssl_verifypeer = FALSE turns off TLS certificate
## verification for this request; confirm that this is still needed.
h <- url %>%
  httr::GET(config = httr::config(ssl_verifypeer = FALSE)) %>%
  read_html()

## grabs the tables
tab <- h %>% html_nodes("table")
## the first table on the page is the one used; tag each row with the year
## that was requested above (.env$year is the `year` variable, never a
## column of the scraped table)
df <- tab[[1]] %>%
  html_table() %>%
  mutate(year = .env$year)
df
## # A tibble: 100 × 4
##      No. Title                          `Artist(s)`                         year
##    <int> <chr>                          <chr>                              <dbl>
##  1     1 "\"Shape of You\""             Ed Sheeran                          2017
##  2     2 "\"Despacito (Remix)\""        Luis Fonsi and Daddy Yankee featu…  2017
##  3     3 "\"That's What I Like\""       Bruno Mars                          2017
##  4     4 "\"Humble\""                   Kendrick Lamar                      2017
##  5     5 "\"Something Just Like This\"" The Chainsmokers and Coldplay       2017
##  6     6 "\"Bad and Boujee\""           Migos featuring Lil Uzi Vert        2017
##  7     7 "\"Closer\""                   The Chainsmokers featuring Halsey   2017
##  8     8 "\"Body Like a Back Road\""    Sam Hunt                            2017
##  9     9 "\"Believer\""                 Imagine Dragons                     2017
## 10    10 "\"Congratulations\""          Post Malone featuring Quavo         2017
## # … with 90 more rows
## Scrape the first table from the Wikipedia "Billboard Year-End Hot 100
## singles of <year>" page.
##
## year: the year to scrape; it is pasted onto the end of the page URL and
##   stored in a `year` column of the result.
## Returns the first <table> on the page as parsed by html_table(), with the
##   `year` column added. Stops with an error if the page has no tables.
get_wiki_100 <- function(year) {
  url <- paste0("https://en.wikipedia.org/wiki/Billboard_Year-End_Hot_100_singles_of_", year)

  ## NOTE(review): ssl_verifypeer = FALSE turns off TLS certificate
  ## verification for this request; confirm that this is still needed.
  h <- url %>%
    httr::GET(config = httr::config(ssl_verifypeer = FALSE)) %>%
    read_html()

  tab <- h %>% html_nodes("table")
  if (length(tab) == 0) {
    stop("No <table> found at ", url, call. = FALSE)
  }

  ## .env$year always refers to the function argument, even if the scraped
  ## table happens to contain a column that is also called `year`
  tab[[1]] %>%
    html_table() %>%
    mutate(year = .env$year)
}
get_wiki_100(year = 2017)
## # A tibble: 100 × 4
##      No. Title                          `Artist(s)`                         year
##    <int> <chr>                          <chr>                              <dbl>
##  1     1 "\"Shape of You\""             Ed Sheeran                          2017
##  2     2 "\"Despacito (Remix)\""        Luis Fonsi and Daddy Yankee featu…  2017
##  3     3 "\"That's What I Like\""       Bruno Mars                          2017
##  4     4 "\"Humble\""                   Kendrick Lamar                      2017
##  5     5 "\"Something Just Like This\"" The Chainsmokers and Coldplay       2017
##  6     6 "\"Bad and Boujee\""           Migos featuring Lil Uzi Vert        2017
##  7     7 "\"Closer\""                   The Chainsmokers featuring Halsey   2017
##  8     8 "\"Body Like a Back Road\""    Sam Hunt                            2017
##  9     9 "\"Believer\""                 Imagine Dragons                     2017
## 10    10 "\"Congratulations\""          Post Malone featuring Quavo         2017
## # … with 90 more rows
library(purrr)
## Years to scrape, stored as a list with one numeric element per year.
year_list <- as.list(seq(2017, 2021, by = 1))
year_list
## [[1]]
## [1] 2017
## 
## [[2]]
## [1] 2018
## 
## [[3]]
## [1] 2019
## 
## [[4]]
## [1] 2020
## 
## [[5]]
## [1] 2021
## Apply get_wiki_100() to every year; the result is a list holding one data
## frame per year.
df_all <- year_list %>% map(get_wiki_100)
df_all
## [[1]]
## # A tibble: 100 × 4
##      No. Title                          `Artist(s)`                         year
##    <int> <chr>                          <chr>                              <dbl>
##  1     1 "\"Shape of You\""             Ed Sheeran                          2017
##  2     2 "\"Despacito (Remix)\""        Luis Fonsi and Daddy Yankee featu…  2017
##  3     3 "\"That's What I Like\""       Bruno Mars                          2017
##  4     4 "\"Humble\""                   Kendrick Lamar                      2017
##  5     5 "\"Something Just Like This\"" The Chainsmokers and Coldplay       2017
##  6     6 "\"Bad and Boujee\""           Migos featuring Lil Uzi Vert        2017
##  7     7 "\"Closer\""                   The Chainsmokers featuring Halsey   2017
##  8     8 "\"Body Like a Back Road\""    Sam Hunt                            2017
##  9     9 "\"Believer\""                 Imagine Dragons                     2017
## 10    10 "\"Congratulations\""          Post Malone featuring Quavo         2017
## # … with 90 more rows
## 
## [[2]]
## # A tibble: 100 × 4
##      No. Title                `Artist(s)`                                year
##    <int> <chr>                <chr>                                     <dbl>
##  1     1 "\"God's Plan\""     Drake                                      2018
##  2     2 "\"Perfect\""        Ed Sheeran                                 2018
##  3     3 "\"Meant to Be\""    Bebe Rexha featuring Florida Georgia Line  2018
##  4     4 "\"Havana\""         Camila Cabello featuring Young Thug        2018
##  5     5 "\"Rockstar\""       Post Malone featuring 21 Savage            2018
##  6     6 "\"Psycho\""         Post Malone featuring Ty Dolla Sign        2018
##  7     7 "\"I Like It\""      Cardi B, Bad Bunny and J Balvin            2018
##  8     8 "\"The Middle\""     Zedd, Maren Morris and Grey                2018
##  9     9 "\"In My Feelings\"" Drake                                      2018
## 10    10 "\"Girls Like You\"" Maroon 5 featuring Cardi B                 2018
## # … with 90 more rows
## 
## [[3]]
## # A tibble: 100 × 4
##      No. Title               `Artist(s)`                          year
##    <int> <chr>               <chr>                               <dbl>
##  1     1 "\"Old Town Road\"" Lil Nas X featuring Billy Ray Cyrus  2019
##  2     2 "\"Sunflower\""     Post Malone and Swae Lee             2019
##  3     3 "\"Without Me\""    Halsey                               2019
##  4     4 "\"Bad Guy\""       Billie Eilish                        2019
##  5     5 "\"Wow\""           Post Malone                          2019
##  6     6 "\"Happier\""       Marshmello and Bastille              2019
##  7     7 "\"7 Rings\""       Ariana Grande                        2019
##  8     8 "\"Talk\""          Khalid                               2019
##  9     9 "\"Sicko Mode\""    Travis Scott                         2019
## 10    10 "\"Sucker\""        Jonas Brothers                       2019
## # … with 90 more rows
## 
## [[4]]
## # A tibble: 100 × 4
##      No. Title                   `Artist(s)`                   year
##    <int> <chr>                   <chr>                        <dbl>
##  1     1 "\"Blinding Lights\""   The Weeknd                    2020
##  2     2 "\"Circles\""           Post Malone                   2020
##  3     3 "\"The Box\""           Roddy Ricch                   2020
##  4     4 "\"Don't Start Now\""   Dua Lipa                      2020
##  5     5 "\"Rockstar\""          DaBaby featuring Roddy Ricch  2020
##  6     6 "\"Adore You\""         Harry Styles                  2020
##  7     7 "\"Life Is Good\""      Future featuring Drake        2020
##  8     8 "\"Memories\""          Maroon 5                      2020
##  9     9 "\"The Bones\""         Maren Morris                  2020
## 10    10 "\"Someone You Loved\"" Lewis Capaldi                 2020
## # … with 90 more rows
## 
## [[5]]
## # A tibble: 100 × 4
##      No. Title                                `Artist(s)`                   year
##    <int> <chr>                                <chr>                        <dbl>
##  1     1 "\"Levitating\""                     Dua Lipa                      2021
##  2     2 "\"Save Your Tears\""                The Weeknd and Ariana Grande  2021
##  3     3 "\"Blinding Lights\""                The Weeknd                    2021
##  4     4 "\"Mood\""                           24kGoldn featuring Iann Dior  2021
##  5     5 "\"Good 4 U\""                       Olivia Rodrigo                2021
##  6     6 "\"Kiss Me More\""                   Doja Cat featuring SZA        2021
##  7     7 "\"Leave the Door Open\""            Silk Sonic (Bruno Mars and …  2021
##  8     8 "\"Drivers License\""                Olivia Rodrigo                2021
##  9     9 "\"Montero (Call Me by Your Name)\"" Lil Nas X                     2021
## 10    10 "\"Peaches\""                        Justin Bieber featuring Dan…  2021
## # … with 90 more rows
## Stack the yearly tables into one data frame, rename the columns to match
## the billboard package, and strip the double-quote characters that wrap
## each scraped title.
df_2017_present <- df_all %>%
  bind_rows() %>%
  rename(no = No., title = Title, artist = `Artist(s)`) %>%
  mutate(title = str_remove_all(title, pattern = "\""))

## Turn the billboard package data into a tibble whose column types line up
## with the scraped data: integer `no` and numeric `year`.
wiki_tibble <- wiki_hot_100s %>%
  as_tibble() %>%
  mutate(
    no = as.integer(no),
    year = as.numeric(year)
  )
## Warning in mask$eval_all_mutate(quo): NAs introduced by coercion
hot100_df <- bind_rows(wiki_tibble, df_2017_present)

Exercise 6. Use the hot100_df to make either a bar plot or a lollipop chart of the most popular artists of the 2010s (2010 through 2019). It may be helpful to make this plot without looking back at the code for the 2000s plot until you get stuck.

## Ten artists with the most year-end Hot 100 songs from 2010 through 2019,
## with `artist` turned into a factor ordered by song count for plotting.
top10_df <- hot100_df %>%
  filter(year >= 2010, year <= 2019) %>%
  count(artist, name = "nsongs", sort = TRUE) %>%
  slice_head(n = 10) %>%
  mutate(artist = fct_reorder(artist, nsongs))

## Lollipop chart for the 2010s: a point at each artist's song count and a
## horizontal stem running from 0 to that count on the artist's row.
top10_df %>%
  ggplot(mapping = aes(x = nsongs, y = artist)) +
  geom_point() +
  geom_segment(mapping = aes(x = 0, xend = nsongs, yend = artist))

Exercise 7. Much of the code to scrape the data, using purrr to iterate over the scrape, and then combining the list of data frames to a single data frame may be new. It is not expected that you are able to write this code on your own, but you should have an overall understanding of what the code is doing. Write 2-3 sentences that summarizes the overall purpose of the rvest and purrr code.

The rvest package scrapes the data tables from the Wikipedia pages, while the purrr package applies our scraping function to each year in a list, so we do not have to type out the code for every year. Combining the resulting list of data frames gives a single data set with all of the years we want.

Exercise 8. If you have time, use purrr and rvest to read different data from Wikipedia into R. For example, you might consider sports championship data, which will typically have a different Wikipedia page for each year or season.

Section 5.2: HPI

library(tidyverse)
hpi_df <- read_csv("data/hpi-tidy.csv")
## Rows: 151 Columns: 11
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): Country, GovernanceRank, Region
## dbl (8): HPIRank, LifeExpectancy, Wellbeing, HappyLifeYears, Footprint, Happ...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
hpi_df
## # A tibble: 151 × 11
##    HPIRank Country     LifeExpectancy Wellbeing HappyLifeYears Footprint
##      <dbl> <chr>                <dbl>     <dbl>          <dbl>     <dbl>
##  1     109 Afghanistan           48.7      4.76           29.0     0.540
##  2      18 Albania               76.9      5.27           48.8     1.81 
##  3      26 Algeria               73.1      5.24           46.2     1.65 
##  4     127 Angola                51.1      4.21           28.2     0.891
##  5      17 Argentina             75.9      6.44           55.0     2.71 
##  6      53 Armenia               74.2      4.37           41.9     1.73 
##  7      76 Australia             81.9      7.41           65.5     6.68 
##  8      48 Austria               80.9      7.35           64.3     5.29 
##  9      80 Azerbaijan            70.7      4.22           39.1     1.97 
## 10     146 Bahrain               75.1      4.55           43.5     6.65 
## # … with 141 more rows, and 5 more variables: HappyPlanetIndex <dbl>,
## #   Population <dbl>, GDPcapita <dbl>, GovernanceRank <chr>, Region <chr>
ggplot(data = hpi_df, aes(x = Footprint, y = Wellbeing)) +
  geom_point()

## Keep only the row for the United States so it can be labelled separately.
hpi_us <- filter(hpi_df, Country == "United States of America")
hpi_us
## # A tibble: 1 × 11
##   HPIRank Country              LifeExpectancy Wellbeing HappyLifeYears Footprint
##     <dbl> <chr>                         <dbl>     <dbl>          <dbl>     <dbl>
## 1     105 United States of Am…           78.5      7.16           61.3      7.19
## # … with 5 more variables: HappyPlanetIndex <dbl>, Population <dbl>,
## #   GDPcapita <dbl>, GovernanceRank <chr>, Region <chr>
## Passing data = hpi_us to geom_label() means that only the observation in
## hpi_us gets a label; the points are still drawn from all of hpi_df.
hpi_df %>%
  ggplot(mapping = aes(x = Footprint, y = Wellbeing)) +
  geom_point() +
  geom_label(mapping = aes(label = Country), data = hpi_us)
library(ggrepel)
## Same scatterplot with a repelled label for the United States, plus a
## second, larger open circle (shape = 1, size = 3) drawn around the United
## States point so that it stands out.
hpi_df %>%
  ggplot(mapping = aes(x = Footprint, y = Wellbeing)) +
  geom_point() +
  geom_label_repel(mapping = aes(label = Country), data = hpi_us) +
  geom_point(data = hpi_us, shape = 1, size = 3)

Exercise 1. Change the code to label 3 countries of interest. Recall that you will need to use the | operator in the dplyr::filter() function.

## Three countries of interest; `|` keeps a row when any one of the
## comparisons is TRUE.
hpi_interest <- hpi_df %>%
  filter(
    Country == "Afghanistan" |
      Country == "Albania" |
      Country == "Algeria"
  )

## Label and circle only the countries of interest.
hpi_df %>%
  ggplot(mapping = aes(x = Footprint, y = Wellbeing)) +
  geom_point() +
  geom_label_repel(mapping = aes(label = Country), data = hpi_interest) +
  geom_point(data = hpi_interest, shape = 1, size = 3)

plotly

## install.packages("plotly")
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
## Interactive version of the scatterplot.
plot1 <- hpi_df %>%
  ggplot(mapping = aes(x = Footprint, y = Wellbeing)) +
  geom_point()
ggplotly(plot1)
## Map Country to the label aesthetic and pass it as the tooltip.
plot1 <- hpi_df %>%
  ggplot(mapping = aes(x = Footprint, y = Wellbeing, label = Country)) +
  geom_point()
ggplotly(plot1, tooltip = "label")
## Labelled scatterplot with a title, a subtitle (smaller text than the
## title), a caption at the bottom of the figure, and custom axis labels.
hpi_df %>%
  ggplot(mapping = aes(x = Footprint, y = Wellbeing)) +
  geom_point() +
  geom_label_repel(mapping = aes(label = Country), data = hpi_us) +
  geom_point(data = hpi_us, shape = 1, size = 3) +
  ggtitle(
    label = "Countries with a Higher Ecological Footprint Tend to Have Citizens with Higher Wellbeing",
    subtitle = "Wellbeing is on a 1-10 scale"
  ) +
  xlab("Ecological Footprint") +
  ylab("Wellbeing") +
  labs(caption = "Data Source: http://happyplanetindex.org/countries")

## Points coloured by Region using the ColorBrewer "Accent" palette.
hpi_df %>%
  ggplot(mapping = aes(x = Footprint, y = HappyLifeYears)) +
  geom_point(mapping = aes(colour = Region)) +
  scale_colour_brewer(palette = "Accent")

## Points coloured by Region using the discrete viridis "plasma" option.
hpi_df %>%
  ggplot(mapping = aes(x = Footprint, y = HappyLifeYears)) +
  geom_point(mapping = aes(colour = Region)) +
  scale_colour_viridis_d(option = "plasma")

## One panel per Region instead of one colour per Region.
hpi_df %>%
  ggplot(mapping = aes(x = Footprint, y = HappyLifeYears)) +
  geom_point() +
  facet_wrap(vars(Region))

library(palmerpenguins)
## With only a few species, colour is good enough to tell them apart.
penguins %>%
  ggplot(mapping = aes(x = bill_length_mm, y = bill_depth_mm)) +
  geom_point(mapping = aes(colour = species))
## Warning: Removed 2 rows containing missing values (geom_point).

## Faceting by species is probably unnecessary here: colour is better.
penguins %>%
  ggplot(mapping = aes(x = bill_length_mm, y = bill_depth_mm)) +
  geom_point() +
  facet_wrap(vars(species))
## Warning: Removed 2 rows containing missing values (geom_point).

## Simulated data: 500 standard-normal (x, y) pairs split into ten groups
## ("A" through "J") of 50 consecutive rows each.
colour_bad <- tibble(
  x = rnorm(500, mean = 0, sd = 1),
  y = rnorm(500, mean = 0, sd = 1),
  groupvar = rep(LETTERS[1:10], each = 50)
)

## With ten groups, colour is a bad choice: the groups cannot really be
## distinguished from one another.
colour_bad %>%
  ggplot(mapping = aes(x = x, y = y, colour = groupvar)) +
  geom_point() +
  geom_smooth(se = FALSE)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

## Faceting works better here: each group gets its own panel.
colour_bad %>%
  ggplot(mapping = aes(x = x, y = y)) +
  geom_point() +
  geom_smooth(se = FALSE) +
  facet_wrap(vars(groupvar))
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'